{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 6.1 Cleaning Text"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['Interrobang. By aishwarya Henriette',\n",
       " 'Parking And Going. By Karl Gautier',\n",
       " 'Today Is The night. By Jarek Prakash']"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "text_data = [\"  Interrobang. By aishwarya Henriette    \",\n",
    "            \"Parking And Going. By Karl Gautier\",\n",
    "            \"    Today Is The night. By Jarek Prakash\"]\n",
    "\n",
    "# strip whitespaces\n",
    "strip_whitespace = [string.strip() for string in text_data]\n",
    "strip_whitespace"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['Interrobang By aishwarya Henriette',\n",
       " 'Parking And Going By Karl Gautier',\n",
       " 'Today Is The night By Jarek Prakash']"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "remove_periods = [string.replace(\".\", \"\") for string in strip_whitespace]\n",
    "remove_periods"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['INTERROBANG BY AISHWARYA HENRIETTE',\n",
       " 'PARKING AND GOING BY KARL GAUTIER',\n",
       " 'TODAY IS THE NIGHT BY JAREK PRAKASH']"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "def capitalizer(string: str) -> str:\n",
    "    return string.upper()\n",
    "\n",
    "[capitalizer(string) for string in remove_periods]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['XXXXXXXXXXX XX XXXXXXXXX XXXXXXXXX',\n",
       " 'XXXXXXX XXX XXXXX XX XXXX XXXXXXX',\n",
       " 'XXXXX XX XXX XXXXX XX XXXXX XXXXXXX']"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import re\n",
    "\n",
    "def replace_letters_with_X(string: str) -> str:\n",
    "    return re.sub(r\"[a-zA-Z]\", \"X\", string)\n",
    "\n",
    "[replace_letters_with_X(string) for string in remove_periods]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### See Also\n",
    "* Beginners Tutorial for Regular Expressions in Python (https://www.analyticsvidhya.com/blog/2015/06/regular-expression-python/)\n",
    "\n",
    "## 6.2 Parsing and Cleaning HTML"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'Yan Chin'"
      ]
     },
     "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from bs4 import BeautifulSoup\n",
    "\n",
    "html = \"\"\"\n",
    "    <div class='full_name'><span style='font-weight:bold'>Yan</span> Chin</div>\n",
    "\"\"\"\n",
    "\n",
    "soup = BeautifulSoup(html)\n",
    "\n",
    "soup.find(\"div\", {\"class\": \"full_name\"}).text"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### See Also\n",
    "* Beautiful Soup documentation (https://www.crummy.com/software/BeautifulSoup/bs4/doc/)\n",
    "\n",
    "## 6.3 Removing Punctuation"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['Hi I Love This Song', '10000 Agree LoveIT', 'Right']"
      ]
     },
     "execution_count": 20,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import unicodedata\n",
    "import sys\n",
    "\n",
    "text_data = ['Hi!!! I. Love. This. Song.....', '10000% Agree!!!! #LoveIT', 'Right?!?!']\n",
    "\n",
    "# create a dictionary of punctuation characters\n",
    "punctuation = dict.fromkeys(i for i in range(sys.maxunicode) if unicodedata.category(chr(i)).startswith('P'))\n",
    "\n",
    "# for each string, remove any punctuation characters\n",
    "[string.translate(punctuation) for string in text_data]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 6.4 Tokenizing Text"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['The', 'science', 'of', 'today', 'is', 'the', 'technology', 'of', 'tommorrow']"
      ]
     },
     "execution_count": 31,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from nltk.tokenize import word_tokenize\n",
    "string = \"The science of today is the technology of tommorrow\"\n",
    "\n",
    "# tokenize words\n",
    "word_tokenize(string)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['The science of today is the technology of tommorw.', 'Tommorrow is today']"
      ]
     },
     "execution_count": 32,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from nltk.tokenize import sent_tokenize\n",
    "string = \"The science of today is the technology of tommorw. Tommorrow is today\"\n",
    "\n",
    "# tokenize sentences\n",
    "sent_tokenize(string)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 6.5 Removing Stop Words"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[nltk_data] Downloading package stopwords to /Users/f00/nltk_data...\n",
      "[nltk_data]   Package stopwords is already up-to-date!\n"
     ]
    },
    {
     "ename": "OSError",
     "evalue": "No such file or directory: '/Users/f00/nltk_data/corpora/stopwords/english'",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mOSError\u001b[0m                                   Traceback (most recent call last)",
      "\u001b[0;32m<ipython-input-38-73e1e661dfff>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[1;32m      5\u001b[0m \u001b[0mtokenized_words\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0;34m'i'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'am'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'going'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'to'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'go'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'to'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'the'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'store'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'and'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'park'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      6\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 7\u001b[0;31m \u001b[0mstop_words\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mstopwords\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mwords\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'english'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m      8\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      9\u001b[0m \u001b[0;31m# remove stop words\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m~/anaconda/envs/machine_learning_cookbook/lib/python3.6/site-packages/nltk/corpus/reader/wordlist.py\u001b[0m in \u001b[0;36mwords\u001b[0;34m(self, fileids, ignore_lines_startswith)\u001b[0m\n\u001b[1;32m     20\u001b[0m     \"\"\"\n\u001b[1;32m     21\u001b[0m     \u001b[0;32mdef\u001b[0m \u001b[0mwords\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfileids\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mignore_lines_startswith\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m'\\n'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 22\u001b[0;31m         return [line for line in line_tokenize(self.raw(fileids))\n\u001b[0m\u001b[1;32m     23\u001b[0m                 if not line.startswith(ignore_lines_startswith)]\n\u001b[1;32m     24\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m~/anaconda/envs/machine_learning_cookbook/lib/python3.6/site-packages/nltk/corpus/reader/wordlist.py\u001b[0m in \u001b[0;36mraw\u001b[0;34m(self, fileids)\u001b[0m\n\u001b[1;32m     26\u001b[0m         \u001b[0;32mif\u001b[0m \u001b[0mfileids\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mfileids\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_fileids\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     27\u001b[0m         \u001b[0;32melif\u001b[0m \u001b[0misinstance\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfileids\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mstring_types\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mfileids\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0mfileids\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 28\u001b[0;31m         \u001b[0;32mreturn\u001b[0m \u001b[0mconcat\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mopen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mf\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mread\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mf\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mfileids\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m     29\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     30\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m~/anaconda/envs/machine_learning_cookbook/lib/python3.6/site-packages/nltk/corpus/reader/wordlist.py\u001b[0m in \u001b[0;36m<listcomp>\u001b[0;34m(.0)\u001b[0m\n\u001b[1;32m     26\u001b[0m         \u001b[0;32mif\u001b[0m \u001b[0mfileids\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mfileids\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_fileids\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     27\u001b[0m         \u001b[0;32melif\u001b[0m \u001b[0misinstance\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfileids\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mstring_types\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mfileids\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0mfileids\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 28\u001b[0;31m         \u001b[0;32mreturn\u001b[0m \u001b[0mconcat\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mopen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mf\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mread\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mf\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mfileids\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m     29\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     30\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m~/anaconda/envs/machine_learning_cookbook/lib/python3.6/site-packages/nltk/corpus/reader/api.py\u001b[0m in \u001b[0;36mopen\u001b[0;34m(self, file)\u001b[0m\n\u001b[1;32m    211\u001b[0m         \"\"\"\n\u001b[1;32m    212\u001b[0m         \u001b[0mencoding\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mencoding\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfile\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 213\u001b[0;31m         \u001b[0mstream\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_root\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mjoin\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfile\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mopen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mencoding\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    214\u001b[0m         \u001b[0;32mreturn\u001b[0m \u001b[0mstream\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    215\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m~/anaconda/envs/machine_learning_cookbook/lib/python3.6/site-packages/nltk/data.py\u001b[0m in \u001b[0;36mjoin\u001b[0;34m(self, fileid)\u001b[0m\n\u001b[1;32m    338\u001b[0m     \u001b[0;32mdef\u001b[0m \u001b[0mjoin\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfileid\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    339\u001b[0m         \u001b[0m_path\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mos\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpath\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mjoin\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_path\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfileid\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 340\u001b[0;31m         \u001b[0;32mreturn\u001b[0m \u001b[0mFileSystemPathPointer\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0m_path\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    341\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    342\u001b[0m     \u001b[0;32mdef\u001b[0m \u001b[0m__repr__\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m~/anaconda/envs/machine_learning_cookbook/lib/python3.6/site-packages/nltk/compat.py\u001b[0m in \u001b[0;36m_decorator\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m    219\u001b[0m     \u001b[0;32mdef\u001b[0m \u001b[0m_decorator\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    220\u001b[0m         \u001b[0margs\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0madd_py3_data\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0margs\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m2\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 221\u001b[0;31m         \u001b[0;32mreturn\u001b[0m \u001b[0minit_func\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    222\u001b[0m     \u001b[0;32mreturn\u001b[0m \u001b[0mwraps\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0minit_func\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0m_decorator\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    223\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m~/anaconda/envs/machine_learning_cookbook/lib/python3.6/site-packages/nltk/data.py\u001b[0m in \u001b[0;36m__init__\u001b[0;34m(self, _path)\u001b[0m\n\u001b[1;32m    316\u001b[0m         \u001b[0m_path\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mos\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpath\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mabspath\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0m_path\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    317\u001b[0m         \u001b[0;32mif\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mos\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpath\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mexists\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0m_path\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 318\u001b[0;31m             \u001b[0;32mraise\u001b[0m \u001b[0mIOError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'No such file or directory: %r'\u001b[0m \u001b[0;34m%\u001b[0m \u001b[0m_path\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    319\u001b[0m         \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_path\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0m_path\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    320\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;31mOSError\u001b[0m: No such file or directory: '/Users/f00/nltk_data/corpora/stopwords/english'"
     ]
    }
   ],
   "source": [
    "from nltk.corpus import stopwords\n",
    "import nltk\n",
    "nltk.download('stopwords')\n",
    "\n",
    "tokenized_words = ['i', 'am', 'going', 'to', 'go', 'to', 'the', 'store', 'and', 'park']\n",
    "\n",
    "stop_words = stopwords.words('english')\n",
    "\n",
    "# remove stop words\n",
    "[word for word in tokenized_words if word not in stop_words]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 6.6 Stemming Words"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['i', 'am', 'humbl', 'by', 'thi', 'tradit', 'meet']"
      ]
     },
     "execution_count": 39,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from nltk.stem.porter import PorterStemmer\n",
    "\n",
    "tokenized_words = ['i', 'am', 'humbled', 'by', 'this', 'traditional', 'meeting']\n",
    "\n",
    "# create stemmer\n",
    "porter = PorterStemmer()\n",
    "\n",
    "# apply stemmer\n",
    "[porter.stem(word) for word in tokenized_words]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### See Also\n",
    "* Porter Stemming Algorithm (https://tartarus.org/martin/PorterStemmer/)\n",
    "\n",
    "## 6.7 Tagging Part of Speech"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[nltk_data] Downloading package averaged_perceptron_tagger to\n",
      "[nltk_data]     /Users/f00/nltk_data...\n",
      "[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "[('Chris', 'NNP'), ('loved', 'VBD'), ('outdoor', 'RP'), ('running', 'VBG')]"
      ]
     },
     "execution_count": 41,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from nltk import pos_tag\n",
    "from nltk import word_tokenize\n",
    "import nltk\n",
    "nltk.download('averaged_perceptron_tagger')\n",
    "\n",
    "text_data = \"Chris loved outdoor running\"\n",
    "\n",
    "text_tagged = pos_tag(word_tokenize(text_data))\n",
    "\n",
    "text_tagged"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "NLTK uses the Penn Treebank parts for speech tags, some examples:\n",
    "\n",
    "| Tag | Parts of Speech |\n",
    "|---  |-----------------|\n",
    "|NNP| Proper noun, singular|\n",
    "|NN| Noun, singular or mass|\n",
    "|RB| Adverb|\n",
    "|VBD| Verb, past tense|\n",
    "|VBG| Verb, gerund or present participle|\n",
    "|JJ| Adjective|\n",
    "|PRP| Personal pronoun|\n",
    "\n",
    "Once the text has been tagged, we can use the tags to find certain parts of speech. For example, here are all nouns:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['Chris']"
      ]
     },
     "execution_count": 42,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "[word for word, tag in text_tagged if tag in ['NN', 'NNS', 'NNP', 'NNPS']]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 46,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[1, 1, 0, 1, 0, 1, 1, 1, 0],\n",
       "       [1, 0, 1, 1, 0, 0, 0, 0, 1],\n",
       "       [1, 0, 1, 1, 1, 0, 0, 0, 1]])"
      ]
     },
     "execution_count": 46,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from sklearn.preprocessing import MultiLabelBinarizer\n",
    "\n",
    "tweets = [\"I am eating a burrito for breakfast\",\n",
    "         \"Political science is an amazing field\",\n",
    "         \"San Francisco is an awesome city\"]\n",
    "\n",
    "tagged_tweets = []\n",
    "\n",
    "# tag each word and each tweet\n",
    "for tweet in tweets:\n",
    "    tweet_tag = nltk.pos_tag(word_tokenize(tweet))\n",
    "    tagged_tweets.append([tag for word, tag in tweet_tag])\n",
    "\n",
    "# use one hot encoding to convert the tags into features\n",
    "one_hot_multi = MultiLabelBinarizer()\n",
    "one_hot_multi.fit_transform(tagged_tweets)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 47,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array(['DT', 'IN', 'JJ', 'NN', 'NNP', 'PRP', 'VBG', 'VBP', 'VBZ'],\n",
       "      dtype=object)"
      ]
     },
     "execution_count": 47,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# show feature names\n",
    "one_hot_multi.classes_"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 49,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[nltk_data] Downloading package brown to /Users/f00/nltk_data...\n",
      "[nltk_data]   Unzipping corpora/brown.zip.\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "0.8174734002697437"
      ]
     },
     "execution_count": 49,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from nltk.corpus import brown\n",
    "from nltk.tag import UnigramTagger\n",
    "from nltk.tag import BigramTagger\n",
    "from nltk.tag import TrigramTagger\n",
    "import nltk\n",
    "nltk.download('brown')\n",
    "    \n",
    "# get some text from the Brown\n",
    "sentences = brown.tagged_sents(categories='news')\n",
    "\n",
    "# split into 4000 stences for training and 623 for testing\n",
    "train = sentences[:4000]\n",
    "test = sentences[4000:]\n",
    "\n",
    "# create backoff tagger\n",
    "unigram = UnigramTagger(train)\n",
    "bigram = BigramTagger(train, backoff=unigram)\n",
    "trigram = TrigramTagger(train, backoff=bigram)\n",
    "\n",
    "trigram.evaluate(test)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### See Also\n",
    "* https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html\n",
    "\n",
    "## 6.8 Encoding Text as a Bag of Words"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 50,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<3x8 sparse matrix of type '<class 'numpy.int64'>'\n",
       "\twith 8 stored elements in Compressed Sparse Row format>"
      ]
     },
     "execution_count": 50,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import numpy as np\n",
    "from sklearn.feature_extraction.text import CountVectorizer\n",
    "\n",
    "text_data = np.array(['I love Brazil. Brazil!', 'Sweden is best', 'Germany beats both'])\n",
    "\n",
    "count = CountVectorizer()\n",
    "bag_of_words = count.fit_transform(text_data)\n",
    "\n",
    "bag_of_words"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 51,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[0, 0, 0, 2, 0, 0, 1, 0],\n",
       "       [0, 1, 0, 0, 0, 1, 0, 1],\n",
       "       [1, 0, 1, 0, 1, 0, 0, 0]], dtype=int64)"
      ]
     },
     "execution_count": 51,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "bag_of_words.toarray()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 52,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['beats', 'best', 'both', 'brazil', 'germany', 'is', 'love', 'sweden']"
      ]
     },
     "execution_count": 52,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "count.get_feature_names()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 53,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[2],\n",
       "       [0],\n",
       "       [0]])"
      ]
     },
     "execution_count": 53,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "count_2gram = CountVectorizer(ngram_range=(1,2), stop_words='english', vocabulary=['brazil'])\n",
    "bag = count_2gram.fit_transform(text_data)\n",
    "bag.toarray()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 54,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'brazil': 0}"
      ]
     },
     "execution_count": 54,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "count_2gram.vocabulary_"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### See Also\n",
    "* n-gram (https://en.wikipedia.org/wiki/N-gram)\n",
    "* bag of words meets bags of popcorn (https://www.kaggle.com/c/word2vec-nlp-tutorial)\n",
    "\n",
    "## 6.9 Weighting Word Importance"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 55,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<3x8 sparse matrix of type '<class 'numpy.float64'>'\n",
       "\twith 8 stored elements in Compressed Sparse Row format>"
      ]
     },
     "execution_count": 55,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import numpy as np\n",
    "from sklearn.feature_extraction.text import TfidfVectorizer\n",
    "\n",
    "text_data = np.array(['I love Brazil. Brazil!', 'Sweden is best', 'Germany beats both'])\n",
    "\n",
    "# create the tf-idf feature matrix\n",
    "tfidf = TfidfVectorizer()\n",
    "feature_matrix = tfidf.fit_transform(text_data)\n",
    "\n",
    "feature_matrix"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 56,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[0.        , 0.        , 0.        , 0.89442719, 0.        ,\n",
       "        0.        , 0.4472136 , 0.        ],\n",
       "       [0.        , 0.57735027, 0.        , 0.        , 0.        ,\n",
       "        0.57735027, 0.        , 0.57735027],\n",
       "       [0.57735027, 0.        , 0.57735027, 0.        , 0.57735027,\n",
       "        0.        , 0.        , 0.        ]])"
      ]
     },
     "execution_count": 56,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "feature_matrix.toarray()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 57,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'love': 6,\n",
       " 'brazil': 3,\n",
       " 'sweden': 7,\n",
       " 'is': 5,\n",
       " 'best': 1,\n",
       " 'germany': 4,\n",
       " 'beats': 0,\n",
       " 'both': 2}"
      ]
     },
     "execution_count": 57,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "tfidf.vocabulary_"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "$$\n",
    "tfidf(t, d) = tf(t,d) * idf(t)\n",
    "$$\n",
    "\n",
    "where $t$ is a word\n",
    "\n",
    "$d$ is a document\n",
    "\n",
    "$$\n",
    "idf(t) = log(\\frac{1 + n_d}{1 + df(d, t}) +1\n",
    "$$\n",
    "\n",
    "where $n_d$ is the number of documents and \n",
    "\n",
    "$df(d,t)$ is term, $t$'s document frequency (i.e. number of documents where the term appears)\n",
    "\n",
    "### See Also\n",
    "* scikit-learn documentation: tf-idf term weighting (http://scikit-learn.org/stable/modules/feature_extraction.html#tfidf-term-weighting)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python [conda env:machine_learning_cookbook]",
   "language": "python",
   "name": "conda-env-machine_learning_cookbook-py"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
